library(tidyverse)
## Warning: пакет 'ggplot2' был собран под R версии 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
life_expectancy_data <- readr::read_rds('life_expectancy_data.RDS')
life_expectancy_data %>% glimpse()
## Rows: 195
## Columns: 23
## $ Country <chr> "Afghanistan", "Albania", "A…
## $ Year <int> 2019, 2019, 2019, 2019, 2019…
## $ Gender <chr> "Female", "Female", "Female"…
## $ `Life expectancy` <dbl> 66.388, 80.201, 78.133, 64.0…
## $ Unemployment <dbl> 14.065000, 11.322000, 18.629…
## $ `Infant Mortality` <dbl> 42.90000, 7.70000, 18.60000,…
## $ GDP <dbl> 1.879945e+10, 1.540024e+10, …
## $ GNI <dbl> 1.909831e+10, 1.519866e+10, …
## $ `Clean fuels and cooking technologies` <dbl> 36.00000, 80.70000, 99.30000…
## $ `Per Capita` <dbl> 494.1793, 5395.6595, 3989.66…
## $ `Mortality caused by road traffic injury` <dbl> 15.90000, 11.70000, 20.90000…
## $ `Tuberculosis Incidence` <dbl> 189.0, 16.0, 61.0, 351.0, 0.…
## $ `DPT Immunization` <dbl> 66.00000, 99.00000, 91.00000…
## $ `HepB3 Immunization` <dbl> 66.00000, 99.00000, 91.00000…
## $ `Measles Immunization` <dbl> 64.00000, 95.00000, 80.00000…
## $ `Hospital beds` <dbl> 0.4322222, 3.0523077, 1.8000…
## $ `Basic sanitation services` <dbl> 49.00617, 99.18307, 86.13850…
## $ `Tuberculosis treatment` <dbl> 91.00000, 88.00000, 86.00000…
## $ `Urban population` <dbl> 25.754, 61.229, 73.189, 66.1…
## $ `Rural population` <dbl> 74.246, 38.771, 26.811, 33.8…
## $ `Non-communicable Mortality` <dbl> 36.20000, 6.00000, 12.80000,…
## $ `Sucide Rate` <dbl> 3.60000, 2.70000, 1.80000, 2…
## $ continent <fct> Asia, Europe, Africa, Africa…
life_expectancy_data %>% summary()
## Country Year Gender Life expectancy
## Length:195 Min. :2019 Length:195 Min. :55.49
## Class :character 1st Qu.:2019 Class :character 1st Qu.:70.02
## Mode :character Median :2019 Mode :character Median :77.55
## Mean :2019 Mean :75.52
## 3rd Qu.:2019 3rd Qu.:80.95
## Max. :2019 Max. :88.10
## Unemployment Infant Mortality GDP GNI
## Min. : 0.178 Min. : 1.40 Min. :1.884e+08 Min. :3.754e+08
## 1st Qu.: 3.735 1st Qu.: 5.35 1st Qu.:1.117e+10 1st Qu.:1.094e+10
## Median : 5.960 Median :13.50 Median :3.967e+10 Median :4.009e+10
## Mean : 8.597 Mean :19.61 Mean :4.660e+11 Mean :4.864e+11
## 3rd Qu.:10.958 3rd Qu.:30.23 3rd Qu.:2.476e+11 3rd Qu.:2.457e+11
## Max. :36.442 Max. :75.80 Max. :2.143e+13 Max. :2.171e+13
## Clean fuels and cooking technologies Per Capita
## Min. : 0.00 Min. : 228.2
## 1st Qu.: 34.50 1st Qu.: 2165.3
## Median : 80.70 Median : 6624.8
## Mean : 65.98 Mean : 16821.0
## 3rd Qu.:100.00 3rd Qu.: 19439.7
## Max. :100.00 Max. :175813.9
## Mortality caused by road traffic injury Tuberculosis Incidence
## Min. : 0.00 Min. : 0.0
## 1st Qu.: 8.20 1st Qu.: 12.0
## Median :16.00 Median : 46.0
## Mean :17.06 Mean :103.8
## 3rd Qu.:24.00 3rd Qu.:138.5
## Max. :64.60 Max. :654.0
## DPT Immunization HepB3 Immunization Measles Immunization Hospital beds
## Min. :35.00 Min. :35.00 Min. :37.00 Min. : 0.200
## 1st Qu.:85.69 1st Qu.:81.31 1st Qu.:84.85 1st Qu.: 1.301
## Median :92.00 Median :91.00 Median :92.00 Median : 2.570
## Mean :87.99 Mean :86.76 Mean :87.31 Mean : 2.997
## 3rd Qu.:97.00 3rd Qu.:96.00 3rd Qu.:96.50 3rd Qu.: 3.773
## Max. :99.00 Max. :99.00 Max. :99.00 Max. :13.710
## Basic sanitation services Tuberculosis treatment Urban population
## Min. : 8.632 Min. : 0.00 Min. : 13.25
## 1st Qu.: 62.919 1st Qu.: 73.00 1st Qu.: 41.92
## Median : 91.144 Median : 82.00 Median : 58.76
## Mean : 77.380 Mean : 77.57 Mean : 59.12
## 3rd Qu.: 98.582 3rd Qu.: 88.00 3rd Qu.: 78.02
## Max. :100.000 Max. :100.00 Max. :100.00
## Rural population Non-communicable Mortality Sucide Rate continent
## Min. : 0.00 Min. : 4.40 Min. : 0.300 Africa :52
## 1st Qu.:21.98 1st Qu.:11.85 1st Qu.: 2.050 Americas:38
## Median :41.24 Median :17.20 Median : 3.500 Asia :42
## Mean :40.88 Mean :17.05 Mean : 4.802 Europe :48
## 3rd Qu.:58.08 3rd Qu.:22.10 3rd Qu.: 6.600 Oceania :15
## Max. :86.75 Max. :43.70 Max. :30.100
library(plotly)
## Warning: пакет 'plotly' был собран под R версии 4.3.2
##
## Присоединяю пакет: 'plotly'
## Следующий объект скрыт от 'package:ggplot2':
##
## last_plot
## Следующий объект скрыт от 'package:stats':
##
## filter
## Следующий объект скрыт от 'package:graphics':
##
## layout
plot_ly(data = life_expectancy_data[(life_expectancy_data$`Rural population` != 0) & (life_expectancy_data$`Sucide Rate` != 0),],
x = ~ `Rural population`,
y = ~ `Sucide Rate`,
color = ~ continent)
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
Life expectancy между группами стран Африки и Америки. Вид
статистического теста определите самостоятельно. Визуализируйте
результат через библиотеку rstatix.Проведем тест Колмогорова-Смирнова.
library(rstatix)
##
## Присоединяю пакет: 'rstatix'
## Следующий объект скрыт от 'package:stats':
##
## filter
library(ggpubr)
africa_data <- life_expectancy_data %>%
filter(continent == 'Africa')
americas_data <- life_expectancy_data %>%
filter(continent == 'Americas')
ks_test_result <- ks.test(africa_data$`Life expectancy`, americas_data$`Life expectancy`)
ggboxplot(data = rbind(africa_data, americas_data), x = "continent", y = "Life expectancy",
title = "Comparison of Life Expectancy Distributions",
ylab = "Life Expectancy",
color = "continent") +
stat_compare_means(label = "p.format") +
theme_pubr()
Year. Сделайте корреляционный анализ этих
данных. Постройте два любых типа графиков для визуализации
корреляций.LED_numeric <- life_expectancy_data %>%
select_if(is.numeric) %>%
select(-Year)
library(corrplot)
## corrplot 0.92 loaded
LED_numeric_cor <- cor(LED_numeric)
corrplot(LED_numeric_cor, method = 'color', tl.cex = 0.8, tl.col = "blue", tl.srt = 45, order = 'AOE')
library(corrr)
## Warning: пакет 'corrr' был собран под R версии 4.3.2
rplot(LED_numeric_cor)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
LED_numeric_scaled <- scale(LED_numeric)
distance_matrix <- dist(LED_numeric_scaled)
hc <- hclust(distance_matrix, method = "ward.D2")
fviz_dend(hc, cex = 0.5, k = 5, k_colors = "jco", type = "circular")
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
library(pheatmap)
pheatmap(LED_numeric_scaled,
show_rownames = FALSE,
clustering_distance_rows = distance_matrix,
clustering_method = "ward.D2",
cutree_rows = 5,
cutree_cols = length(colnames(LED_numeric_scaled)),
angle_col = 45,
main = "Dendrograms for clustering rows and columns with heatmap")
library(FactoMineR)
LED_numeric.pca <- prcomp(LED_numeric,
scale = T)
summary(LED_numeric.pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.7526 1.4841 1.3952 1.17177 1.08375 0.96347 0.9288
## Proportion of Variance 0.3988 0.1159 0.1025 0.07227 0.06182 0.04886 0.0454
## Cumulative Proportion 0.3988 0.5147 0.6172 0.68945 0.75126 0.80012 0.8455
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.85740 0.69263 0.68937 0.59106 0.54986 0.47085 0.36596
## Proportion of Variance 0.03869 0.02525 0.02501 0.01839 0.01591 0.01167 0.00705
## Cumulative Proportion 0.88421 0.90946 0.93447 0.95286 0.96877 0.98044 0.98749
## PC15 PC16 PC17 PC18 PC19
## Standard deviation 0.34546 0.26941 0.20224 0.06968 1.017e-15
## Proportion of Variance 0.00628 0.00382 0.00215 0.00026 0.000e+00
## Cumulative Proportion 0.99377 0.99759 0.99974 1.00000 1.000e+00
plotly. Желательно, чтобы при
наведении на точку, вы могли видеть название страны.library(ggbiplot)
## Загрузка требуемого пакета: plyr
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Присоединяю пакет: 'plyr'
## Следующий объект скрыт от 'package:ggpubr':
##
## mutate
## Следующие объекты скрыты от 'package:rstatix':
##
## desc, mutate
## Следующие объекты скрыты от 'package:plotly':
##
## arrange, mutate, rename, summarise
## Следующие объекты скрыты от 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## Следующий объект скрыт от 'package:purrr':
##
## compact
## Загрузка требуемого пакета: scales
##
## Присоединяю пакет: 'scales'
## Следующий объект скрыт от 'package:purrr':
##
## discard
## Следующий объект скрыт от 'package:readr':
##
## col_factor
## Загрузка требуемого пакета: grid
fig <- ggbiplot(LED_numeric.pca,
scale=0, alpha = 0.1) +
theme_minimal()
library(plotly)
ggplotly(fig)
DPT Immunization, HepB3 Immunization, Measles Immunization исходя из графика целесообразно объединить в одну компоненту - Immunization. Life expectancy, Basic sanitation services, Clean fuels and cooking technologies - тоже сильно коррелируют.